#Set up

#s1
rm(list=ls())

setwd ("C:/Users/cecidy/Google Drive/ZH_analysis/")

source("ZHfunctions.R")
options(scipen=9999)

ZeroHunger <- read.csv("Data/ZHdf/ZeroHungerDF_2020.csv",header=TRUE, sep=",")

##################################################################################################

#s2

###############################################################################################################################

#start1_

#BF and Education (scool attendance) 04-13

#######################################################################################################################

#1. Prepare each dataframe

#############################################################################################

#Make a nonBF ZHvariable!

ZeroHunger$ZHPerCapTotal04to13for2004Analysis1000NOBF <- with(ZeroHunger,ZHPerCapTotal04to13for2004Analysis1000-BFPerCapTotal04to13for2004Analysis1000)

#######################################################################################################################

Poverty04 <- subset(ZeroHunger, PopDensity20042004Analysis<=150)

load(file = "Data/ForSamples/MPI04List.rda")
load(file = "Data/ForSamples/Basic04.rda")

RuralPoverty04 <- subset(Poverty04, select=c("BFPerCapTotal04to13for2004Analysis1000","ZHPerCapTotal04to13for2004Analysis1000NOBF",Basic04,
                                             MPI04List,"Children7to14InSchool2004Perfor2004Analysis","Children7to14InSchool2013Perfor2004Analysis"))
names(RuralPoverty04)

RuralPoverty04 <- na.omit(RuralPoverty04)

#Then take away the ones affected by rural credit
load(file = "Data/ForSamples/EcludeForRCProb04.rda")

RuralPoverty04 <- subset(RuralPoverty04,(!(IBGECode7digit %in% EcludeForRCProb04)))

#Just copy to get the "quality" dataset
RuralPoverty04quality <- RuralPoverty04

#############################################################################################

#Make logged variables

#First make MPI with the correct 0s
test <- NrOfZeroALL(RuralPoverty04quality)#This gives all the vars with 0s
MPIadd <- test[[1]]
Otheradd <- test[[2]]

WithAdded <- lapply(MPIadd,function(y) AddConstant(RuralPoverty04quality,y,addName=""))
WithAdded <- data.frame(do.call("cbind",WithAdded))

RuralPoverty04quality[,MPIadd] <- WithAdded[,MPIadd]#just override

#Then need to make the MPI - OBS 04 FUNCTION
RuralPoverty04quality <- MakeMPI04(RuralPoverty04quality)

#Then for the others I cbind them on, and because Ive not included any of previous noCero there no duplicates
RuralPoverty04quality <- cbind(RuralPoverty04quality,lapply(Otheradd,function(y) AddConstant(RuralPoverty04quality,y,addName="nocero")))

###########################################################################################################################

#Log variables
OtheraddCero <- paste(Otheradd,"nocero",sep="")

cols.log <- c("BFPerCapTotal04to13for2004Analysis1000","ZHPerCapTotal04to13for2004Analysis1000NOBF","MPI2004for2004Analysis","MPI2013for2004Analysis",
              "GDPPerCapPublicrealN2004for2004Analysis1000",OtheraddCero,"RemoteMinPopFifty2004Analysis",
              "PopDensity20042004Analysis","SizeKm22004Analysis","MeanSlopefor2004analysis","MeanElevationfor2004analysis")
addlog <- function(x) log10(x)
Log10variables <- data.frame(sapply(RuralPoverty04quality[cols.log],addlog))
colnames(Log10variables) <- paste(colnames(Log10variables), "log10",sep="")

#Get all onto ZH
RuralPoverty04quality <- cbind(RuralPoverty04quality,Log10variables)
head(RuralPoverty04quality)

###########################################################################################################################

#Remove those I dont need
Log10variables <- NULL
Poverty04 <- NULL
TempVars <- NULL
WithAdded <- NULL

#####################################################################

#Make scaled and logged ZH variables, and set contrasts (obs contrast is also set afresh in analysis script)

RuralPoverty04quality<- getDataFrame(RuralPoverty04quality,"contr.Sum",
                                     contrastVar1="stateName",contrastVar2="Biomefor2004Analysis",setRefToMostCommonLevel,
                                     BaseVar="BFPerCapTotal04to13for2004Analysis1000log10",
                                     MainName="BFPerCapTotal04to13for2004Analysis1000log10scaledMain")

#fin1_

#################################################################################################

#start2_

#Prepare vectors of variables for regressions

###############################################################################################################

Depvar <- "Children7to14InSchool2013Perfor2004Analysis"

IndepVarLin <- "BFPerCapTotal04to13for2004Analysis1000log10scaledMain"#already scaled so dont need to do it again

baselineDepVar <- "scale(Children7to14InSchool2004Perfor2004Analysis)"

stateName <- "stateName"

intLin <- c("BFPerCapTotal04to13for2004Analysis1000log10scaledMain","stateName")
intLin <- paste(intLin,collapse="*")

xvars <- c("scale(MPI2004for2004Analysislog10)","scale(GDPPerCapPublicrealN2004for2004Analysis1000log10)",
           "scale(TotalKcal2004percapperdaynocerolog10)",
           "scale(TotalHectareCrops2004for2004Analysisnocerolog10)","scale(TotalHectarePasture2006for2004Analysisnocerolog10)",
           "scale(TotalHectareFarmsLess502006for2004Analysisnocerolog10)","scale(RemoteMinPopFifty2004Analysislog10)",
           "scale(ChangeCummulativeSPEI02and04to11and13for2004Analysis)","scale(TotRCpcNOPRONAF04to13for2004Analysis1000nocerolog10)",
           "scale(MeanSlopefor2004analysislog10)","scale(MeanElevationfor2004analysislog10)",
           "scale(PopDensity20042004Analysislog10)","scale(SizeKm22004Analysislog10)","scale(ZHPerCapTotal04to13for2004Analysis1000NOBFlog10)",
           "Biomefor2004Analysis")


#For CBPS models
cbpsDepVar <- "BFPerCapTotal04to13for2004Analysis1000log10"
cbpsbaselineDepVar <- "Children7to14InSchool2004Perfor2004Analysis"

cbpsVars <- c("MPI2004for2004Analysislog10","GDPPerCapPublicrealN2004for2004Analysis1000log10","TotalKcal2004percapperdaynocerolog10",
              "TotalHectareCrops2004for2004Analysisnocerolog10","TotalHectarePasture2006for2004Analysisnocerolog10",
              "TotalHectareFarmsLess502006for2004Analysisnocerolog10","RemoteMinPopFifty2004Analysislog10",
              "ChangeCummulativeSPEI02and04to11and13for2004Analysis","TotRCpcNOPRONAF04to13for2004Analysis1000nocerolog10",
              "MeanSlopefor2004analysislog10","MeanElevationfor2004analysislog10",
              "PopDensity20042004Analysislog10","SizeKm22004Analysislog10","ZHPerCapTotal04to13for2004Analysis1000NOBFlog10",
              "stateName","Biomefor2004Analysis")

#########################################################################################################################

#fin2_

################################################################################################################################

#READ IN THE STUFF ABOVE

###############################################################################################################################
###############################################################################################################################

#Here I source the above AND get in the election variable so that I am ready to make the weights and final model!

prepareFunc <- function(namefolders,script,yearsample,timeperiod,progr,biomvar){
  result=NULL
  {
    #1.Load in the df
    script <- paste(namefolders,script,sep="")
    
    sourcePartial(script,startTag="#s1",endTag="#s2")
    sourcePartial(script,startTag="#start1_",endTag="#fin1_")
    sourcePartial(script,"#start2_","#fin2_")
    
    print(c(cbpsDepVar,cbpsbaselineDepVar))
    
    #2. Get on the election variable
    ElecDF <- read.csv(paste("Data/Elections/AllElectionPerMun_",yearsample,sep="",paste("sample_Fin.csv",sep="")),
                       header=TRUE, sep=",")
    
    ElecDFsub <- subset(ElecDF, select=c("codeibge",paste("PresElec_vs_",timeperiod,sep="",paste("_w_",progr,sep=""))))
    names(ElecDFsub)[1] <- "IBGECode7digit"
    
    #it has to be the same sample as rural!
    RuralPoverty04quality <- merge(x=RuralPoverty04quality,y=ElecDFsub,by="IBGECode7digit",all.x=T)
    
    #sample information
    print(length(RuralPoverty04quality[,1]))
    elecvar <- length(names(RuralPoverty04quality))
    print(length(which(is.na(RuralPoverty04quality[,elecvar]))))
    RuralPoverty04quality <- RuralPoverty04quality[!is.na(RuralPoverty04quality[,elecvar]),]
    print(length(RuralPoverty04quality[,1]))
    
    #extra - set the correct reference state/biome
    #2. Get the reference levels (want this to be the ones with lowest)
    #2.1. Cannot have empty levels
    RuralPoverty04quality$stateName <- droplevels(RuralPoverty04quality$stateName)
    RuralPoverty04quality[,biomvar] <- droplevels(RuralPoverty04quality[,biomvar])
    
    #2.2. Find the category with least observations
    
    #State
    onetabS <- data.frame(table(RuralPoverty04quality[,"stateName"]))
    onetabS <- onetabS [order(onetabS$Freq),]
    onetabS$Var1 <- as.character(onetabS$Var1)
    
    smallestS <- onetabS[1,1]
    
    #Biome
    onetabB <- data.frame(table(RuralPoverty04quality[,biomvar]))
    onetabB <- onetabB [order(onetabB$Freq),]
    onetabB$Var1 <- as.character(onetabB$Var1)
    
    smallestB <- onetabB[1,1]
    
    #2.3. Set the smallest state/biome as the reference (the last one)
    
    #state Reference
    stateSeq <- levels(RuralPoverty04quality$stateName)
    stateSeqNoRef <- stateSeq [! stateSeq %in% smallestS]
    stateSeqFin <- c(stateSeqNoRef,smallestS)#this ensures that the smallest one is last!
    
    #biome Reference
    biomeSeq <- levels(RuralPoverty04quality[,biomvar])
    biomeSeqNoRef <- biomeSeq [! biomeSeq %in% smallestB]
    biomeSeqFin <- c(biomeSeqNoRef,smallestB)#this ensures that the smallest one is last!
    
    #set the level
    RuralPoverty04quality$stateName <- factor(RuralPoverty04quality$stateName, levels = stateSeqFin)
    RuralPoverty04quality[,biomvar] <- factor(RuralPoverty04quality[,biomvar], levels = biomeSeqFin)
    
    #2.4. Set the contrast
    contrasts(RuralPoverty04quality$stateName) <- "contr.Sum"
    contrasts(RuralPoverty04quality[,biomvar]) <- "contr.Sum"
    
    #Then it is ready to make weights and model!
    
    result <- RuralPoverty04quality
  }
}

RuralPoverty04quality <- prepareFunc("ProgrammeOutcome/BF/Education04to13/",
                                     "25.BFandEducation04.R","2004","2004to2013","BF",
                                     "Biomefor2004Analysis")
head(RuralPoverty04quality)

###############################################################################################################################
###############################################################################################################################

#First look at transformation/getting it normal

###############################################################################################################

#I look at transformation first

hist(RuralPoverty04quality$Children7to14InSchool2013Perfor2004Analysis)
#some skew but not as bad as in census, and robust models dealt with outliers fine

#######################################################################################################

#Untransformed
Depvar <- "Children7to14InSchool2013Perfor2004Analysis"
DepVarLog <- "Children7to14InSchool2013Perfor2004Analysis"

baselineDepVar <- "scale(Children7to14InSchool2004Perfor2004Analysis)"
cbpsbaselineDepVar <- "Children7to14InSchool2004Perfor2004Analysis"

#######################################################################################################

#3.Make cbps formula, add the variable and create a new formula
my.formula <- as.formula(reformulate(termlabels=c(cbpsbaselineDepVar,cbpsVars),response=cbpsDepVar))
addTerm <- paste("PresElec_vs_","2004to2013",sep="",paste("_w_","BF",sep=""))
newmodformula <- update(my.formula,paste("~. +", addTerm))#makes a new formula with the added variable

######################################################################################################

#4. Make the CBPS weight
fit2 <- CBPS(newmodformula, data=RuralPoverty04quality)
CBPStabletest <- CBPStable(fit2,"Education04")
#the improvement is good

#5. But I would want to be able to save this weight and name it something different
save(fit2, file = paste("ProgrammeOutcome/BF/Education04to13/","fit2fullcoreElectionW",sep="",paste(".rda",sep="")))

#########################################################################################################
#########################################################################################################

#Run regression, manually!

cbpsweights <- fit2

FullList <- FullModListRedElectionScale("2004to2013","BF",Depvar,IndepVarLin, baselineDepVar,stateName, xvars,intLin)

##########################################################################################################

#Linear
my.formula <- FullList[[1]]
Modlinear <- lmrob( my.formula,weights = fit2$weights, data=RuralPoverty04quality,fast.s.large.n = Inf,max.it=500,k.max=2000)
summary(Modlinear)

#State linear
my.formula <- FullList[[3]]
ModStatelinear <- lmrob( my.formula,weights = fit2$weights, data=RuralPoverty04quality,fast.s.large.n = Inf,max.it=500,k.max=2000)
summary(ModStatelinear)

summary(ModStatelinear)$adj.r-summary(Modlinear)$adj.r#State linear model is better
Anova(ModStatelinear, type="3")#yes Anova is significant

#So state linear model is the best model

################################################################################

#save the two
save(Modlinear, file = "ProgrammeOutcome/BF/Education04to13/ModlinearElection.rda")
save(ModStatelinear, file = "ProgrammeOutcome/BF/Education04to13/ModStatelinearElection.rda")

##################################################################################

